Team 2: Finalized Merged Jupyter Notebook

Part I: Authors Datasets & Plots

Importing dataset and analysing its content
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
In [2]:
df = pd.read_csv('good_reads_final.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22891 entries, 0 to 22890
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   author_average_rating  22891 non-null  float64
 1   author_gender          22891 non-null  object 
 2   author_genres          22891 non-null  object 
 3   author_id              22891 non-null  int64  
 4   author_name            22891 non-null  object 
 5   author_page_url        22891 non-null  object 
 6   author_rating_count    22891 non-null  int64  
 7   author_review_count    22891 non-null  int64  
 8   birthplace             22891 non-null  object 
 9   book_average_rating    22891 non-null  float64
 10  book_fullurl           22891 non-null  object 
 11  book_id                22891 non-null  object 
 12  book_title             22891 non-null  object 
 13  genre_1                22891 non-null  object 
 14  genre_2                22891 non-null  object 
 15  num_ratings            22891 non-null  int64  
 16  num_reviews            22891 non-null  int64  
 17  pages                  22891 non-null  object 
 18  publish_date           22610 non-null  object 
 19  score                  22891 non-null  int64  
dtypes: float64(2), int64(6), object(12)
memory usage: 3.5+ MB
In [3]:
df.describe()
Out[3]:
author_average_rating author_id author_rating_count author_review_count book_average_rating num_ratings num_reviews score
count 22891.000000 2.289100e+04 2.289100e+04 22891.000000 22891.000000 2.289100e+04 22891.000000 22891.000000
mean 3.960368 3.233957e+06 1.720319e+05 9369.795640 3.951456 4.668349e+04 2324.754794 3893.028483
std 0.240421 3.868091e+06 6.546902e+05 24949.832609 0.291317 1.800698e+05 6837.503574 11022.018268
min 1.820000 4.000000e+00 6.000000e+00 0.000000 0.000000 0.000000e+00 0.000000 55.000000
25% 3.810000 4.083550e+04 4.324500e+03 545.000000 3.770000 8.200000e+02 106.000000 832.000000
50% 3.970000 1.415543e+06 2.463500e+04 2273.000000 3.960000 4.403000e+03 384.000000 1727.000000
75% 4.120000 5.775601e+06 1.113370e+05 8262.500000 4.140000 2.014300e+04 1504.000000 3597.500000
max 5.000000 1.877045e+07 2.111732e+07 516745.000000 5.000000 3.820921e+06 147696.000000 598270.000000
In [4]:
df.head()
Out[4]:
author_average_rating author_gender author_genres author_id author_name author_page_url author_rating_count author_review_count birthplace book_average_rating book_fullurl book_id book_title genre_1 genre_2 num_ratings num_reviews pages publish_date score
0 4.01 female historical-fiction, 74489 Victoria Thompson\n /author/show/74489.Victoria_Thompson 74399 6268 United States\n 4.02 https://www.goodreads.com/book/show/686717.Mur... 686717 \n Murder on St. Mark's Place\n Mystery Historical 5260 375 277 2000 3230
1 4.15 male literature-fiction,mystery-thrillers, 706255 Stieg Larsson\n /author/show/706255.Stieg_Larsson 3726435 142704 Sweden\n 4.13 https://www.goodreads.com/book/show/2429135.Th... 2429135 \n The Girl with the Dragon Tattoo\n Fiction Mystery 2229163 65227 465 August 2005 3062
2 4.00 female romance, 5618190 Mimi Jean Pamfiloff\n /author/show/5618190.Mimi_Jean_Pamfiloff 76496 7975 United States\n 3.99 https://www.goodreads.com/book/show/27833684-t... 27833684 \n Tailored for Trouble\n Romance Contemporary 2151 391 354 2016 4585
3 3.88 male fiction,memoir, 37871 José Donoso\n /author/show/37871.Jos_Donoso 5522 489 Chile\n 4.14 https://www.goodreads.com/book/show/382975.The... 382975 \n The Obscene Bird of Night\n Fiction Magical Realism 1844 173 438 1970 1533
4 4.10 female young-adult,fantasy, 36122 Patricia C. Wrede\n /author/show/36122.Patricia_C_Wrede 291013 13453 United States\n 4.01 https://www.goodreads.com/book/show/64207.Sorc... 64207 \n Sorcery & Cecelia: or The Enchanted Ch... Fantasy Young Adult 17051 1890 326 April 15th 1988 2105
In [ ]:
# df_csv = df[['author_name','genre_1','birthplace']]
# df_csv = df_csv[df_csv['author_name'].notna()]
# df_csv = df_csv[df_csv['genre_1'].notna()]
# df_csv = df_csv[df_csv['birthplace'].notna()]
# df_csv['author_name'] = df_csv['author_name'].str.replace(r'\n', '')
# df_csv['birthplace'] = df_csv['birthplace'].str.replace(r'\n', '')

# df_csv = df_csv.head(5)
# df_csv

# df_csv.to_csv('the_csv_file.csv',sep='/t', index=False)
Plotting average ratings of authors
In [6]:
df_count_author = df.groupby(df['author_id']).agg({'author_id':'count'})
In [7]:
df_unique_autor = df['author_id'].unique()
df_unique_autor
Out[7]:
array([  74489,  706255, 5618190, ..., 8386732, 6543639, 5246010])
In [8]:
new_df = pd.DataFrame(df_unique_autor)
In [9]:
len_group_author = len(new_df.index)
In [10]:
len_non_group_author = len(df.index)
In [11]:
df_top_ratings =   df[(df['score']>df['score'].quantile(0.9))
                    & (df['author_average_rating']>df['author_average_rating'].quantile(0.9)) 
                    & (df['author_rating_count']>df['author_rating_count'].quantile(0.9))
                    & (df['author_review_count']>df['author_review_count'].quantile(0.9))
                    & (df['book_average_rating']>df['book_average_rating'].quantile(0.9))
                    & (df['num_ratings']>df['num_ratings'].quantile(0.8))]

df_top_ratings = df_top_ratings.sort_values(by=['author_average_rating'])

fig_dims = (14, 7)
fig, ax = plt.subplots(figsize=fig_dims,frameon=False)
ax = sns.barplot(x="author_average_rating", y="author_name", data=df_top_ratings, palette="Blues_d")

ax.set_xlim(4.1,4.6)
for p in ax.patches:
    ax.annotate(format(p.get_width(), '.2f'), 
                   ( p.get_width()*1.003 , p.get_y() + p.get_height() + 0.1), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.savefig('author_ratings.png',dpi = 200)
Plotting rate count and rate review count
In [12]:
df_counts =   df[((df['author_rating_count']+df['author_review_count']) > 3e+6)]
df_counts["counts"] = df_counts['author_rating_count']+df_counts['author_review_count']
df_counts = df_counts.sort_values(by=['counts'])

f, ax = plt.subplots(figsize = (20,10))
sns.set_color_codes('pastel')
sns.barplot(x = 'counts', y = 'author_name', data = df_counts,
            label = 'Rate Count', palette = 'Blues_d', edgecolor = 'w')
sns.set_color_codes('muted')
sns.barplot(x = 'author_review_count', y = 'author_name', data = df_counts,
            label = 'Review Count', palette = 'YlOrBr', edgecolor = 'w')
ax.legend(ncol = 2, loc = 'upper right')
sns.despine(left = True, bottom = True)
plt.show()
<ipython-input-12-3bb12012d206>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts["counts"] = df_counts['author_rating_count']+df_counts['author_review_count']
In [13]:
#unique genres
unique_genres = []
genre_1 = list(df['genre_1'])
genre_2 = list(df['genre_2'])
for genre in genre_1:
    if len(genre)>0 and genre not in unique_genres:
        unique_genres.append(genre)
for genre in genre_2:
    if len(genre)>0 and genre not in unique_genres:
        unique_genres.append(genre)
In [14]:
print("There are {} unique genres in the dataset".format(len(unique_genres)))
There are 168 unique genres in the dataset
In [15]:
#All unique genres
np.unique(df['author_gender'])
Out[15]:
array(['female', 'male'], dtype=object)
In [16]:
genre="Mystery"
In [17]:
temp_df_1= df[df['genre_1'] == genre] #rows with the genre_1 column=genre
temp_df_2 = df[df['genre_2'] == genre]#rows with the genre_2 columns=genre
In [18]:
concatinated = pd.concat([temp_df_1,temp_df_2], axis=0) #concatinate the two dataframes
len(concatinated) ==len(temp_df_1) + len(temp_df_2)
Out[18]:
True
In [19]:
concatinated.head()
df_gif = concatinated[['genre_1','genre_2','publish_date']]
In [20]:
female = concatinated[concatinated['author_gender']=='female']
male = concatinated[concatinated['author_gender']=='male']
print("There are {} women and {} men in genre {}".format(len(female), len(male), genre))
There are 741 women and 744 men in genre Mystery
Male and Female authors For all genres
In [21]:
genres_women_men = {}
for genre in unique_genres:
    temp_df_1= df[df['genre_1'] == genre] #rows with the genre_1 column=genre
    temp_df_2 = df[df['genre_2'] == genre]#rows with the genre_2 columns=genre
    concatinated = pd.concat([temp_df_1,temp_df_2], axis=0) #concatinate the two dataframes
    female = concatinated[concatinated['author_gender']=='female']
    male = concatinated[concatinated['author_gender']=='male']
    genres_women_men[genre] = [len(female), len(male)]
In [22]:
genres_women_men['Horror'][0] #women in horror genre
Out[22]:
178
In [23]:
new_dict = {'Genre':[], 'Females':[], 'Males':[], 'Total':[]}
for key,value in genres_women_men.items():
    new_dict['Genre'].append(key)
    new_dict['Females'].append(value[0])
    new_dict['Males'].append(value[1])
    new_dict['Total'].append(value[1]+value[0])
In [24]:
df_counts = pd.DataFrame.from_dict(new_dict)
df_counts.sort_values(by=['Total'])
Out[24]:
Genre Females Males Total
167 Roman 0 1 1
141 Martial Arts 1 0 1
136 Folk Tales 1 0 1
134 Nobel Prize 1 0 1
133 Currency 0 1 1
... ... ... ... ...
8 Historical 1293 1019 2312
5 Young Adult 2769 1797 4566
1 Fiction 1682 3420 5102
3 Fantasy 3011 2624 5635
2 Romance 4415 2024 6439

168 rows × 4 columns

In [25]:
df_counts_top = df_counts.sort_values(by=['Total']).tail(30)
df_counts_top['unknown'] = df_counts_top['Males']/10

f, ax = plt.subplots(figsize = (15,8))
sns.set_color_codes('pastel')
sns.barplot(x = 'Total', y = 'Genre', data = df_counts_top,
            label = 'Males', palette = 'Blues_d', edgecolor = 'w')
sns.set_color_codes('muted')
sns.barplot(x = 'Females', y = 'Genre', data = df_counts_top,
            label = 'Females', palette = 'YlOrBr', edgecolor = 'w')
sns.barplot(x = 'unknown', y = 'Genre', data = df_counts_top,
            label = 'Other', color = 'grey', edgecolor = 'w')
ax.legend( loc = 'upper right')
sns.despine(left = True, bottom = True)
plt.savefig('gender.png')

plt.show()
In [26]:
# Initialing the spiderplot by  
# setting figure size and polar 
# projection 
df_counts_top = df_counts_top.sample(frac=1).reset_index(drop=True)
plt.figure(figsize =(10, 10)) 
plt.subplot(polar = True) 
    
theta = np.linspace(0, 2 * np.pi, len(df_counts_top)+1) 
    
# Arranging the grid into number  
# of sales into equal parts in 
# degrees 
lines, labels = plt.thetagrids(range(0, 360, int(360/len(df_counts_top['Genre']))), 
                                                         list(df_counts_top['Genre'])) 
    
# Plot actual sales graph 
plt.plot(theta, list(df_counts_top['Total'])+[df_counts_top['Total'][0]]) 
plt.fill(theta, list(df_counts_top['Total'])+[df_counts_top['Total'][0]], 'b', alpha = 0.1) 
    
# # Add legend and title for the plot 
plt.title("Genres for 20th Century") 
    
# # Display the plot on the screen 
plt.show() 
In [ ]:
 
In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections

#For map
#conda install basemap
In [28]:
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas) 
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these 
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)

df = pd.read_csv('final_dataset.csv',error_bad_lines = False) 

df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])

#Drop the entries not in English
# for j in range(len(df)):
#     for letters in df['name'][j]:
#         if letters.isalpha() == False:
#             df = df.drop([j])
# for j in range(len(df)):
#     letters_list = list(df.iloc[j,0])
#     for letter in letters_list:
#         if letter.isalpha()==False :
#             df = df.drop([j])
        

# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
In [29]:
#overview of the dataframe
df.isnull().sum()
Out[29]:
name                 0
workcount            0
fan_count            0
gender               0
image_url            0
born            178287
died            197029
influence       201635
average_rate         0
rating_count         0
review_count         0
genre           135534
country         164918
dtype: int64
In [30]:
df = df[df['born'].notna()]
In [31]:
df = df[df['country'].notna()]
In [32]:
df = df[df['genre'].notna()]
df.head()
Out[32]:
name workcount fan_count gender image_url born died influence average_rate rating_count review_count genre country
13 John "Red" Shea 4 3 male https://images.gr-assets.com/authors/131048078... 1965-08-12 NaN NaN 3.55 514 56 biographies and memoirs United States
15 Ricardo Ferrari 31 3 male https://images.gr-assets.com/authors/135516310... 1957-01-18 NaN NaN 3.68 345 31 comics Argentina
45 محمد علي البار 54 37 male https://images.gr-assets.com/authors/130966185... 1939-12-29 NaN NaN 3.65 230 36 religion and spirituality,science,spirituality Yemen
51 Keith Roberts 118 24 male https://s.gr-assets.com/assets/nophoto/user/m_... 1935-09-20 2000-10-05 NaN 3.69 5402 552 fantasy,fiction United Kingdom
54 Shuho Sato 68 1 male https://images.gr-assets.com/authors/157798730... 1973-12-08 NaN NaN 3.82 724 27 comics,graphic novels,manga Japan
In [33]:
df = df.reset_index(drop=True)
In [34]:
for i in range(len(df)):
    df.loc[i,'born'] = df.loc[i,'born'][0:4]
In [35]:
df = df.sort_values(by=['born'])
df = df.reset_index(drop=True)
df_born = df[['born','genre']]
df_born['born'] = df_born.born.astype(int)
df_born.head()
<ipython-input-35-94f7895c4553>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'] = df_born.born.astype(int)
Out[35]:
born genre
0 1680 philosophy,poetry
1 1684 fiction,history,literature,philosophy
2 1685 drama,poetry
3 1685 music
4 1688 poetry
In [36]:
year_0 = 1680
i = 0
for i in range(len(list(df_born['born']))):
    if ((int(df_born['born'][i]) >= year_0) and (int(df_born['born'][i]) < year_0 + 20)):
        df_born['born'][i] = year_0
    if int(df_born['born'][i]) >= (year_0 + 20):
        year_0 = year_0 + 20
        df_born['born'][i] = year_0
df_born.head()
<ipython-input-36-65ad4fbb82a7>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'][i] = year_0
/Users/zhenweiliu/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
<ipython-input-36-65ad4fbb82a7>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'][i] = year_0
Out[36]:
born genre
0 1680 philosophy,poetry
1 1680 fiction,history,literature,philosophy
2 1680 drama,poetry
3 1680 music
4 1680 poetry
In [37]:
b = DataFrame(df_born.genre.str.split(',').tolist(), index=df_born.born).stack()
b = b.reset_index()[[0, 'born']] # var1 variable is currently labeled 0
b.columns = ['genre', 'born'] # renaming var
In [38]:
df_list = b.groupby('born')['genre'].apply(list).to_dict()
In [39]:
# df_list_year = collections.Counter(df_list.get(year))
# dff = pd.DataFrame.from_dict(df_list_year.items())
# dff.sort_values(by=[1]).tail(10)
In [40]:
# import random

# list_co = df_born.genre.unique()
# list_colors = []
# for i in range(len(list_co)):
    
#     r = random.random()
#     b = random.random()
#     g = random.random()
#     color = (r, g, b)
    
#     if color not in list_colors:
#         list_colors.append(color)    
    

# colors = dict(zip(list_co,list_colors))
# colors['fiction']
In [41]:
# Poetry Highlighted

list_co = df_born.genre.unique()
list_colors = []
for i in range(len(list_co)):
    

    color = 'grey'
    if list_co[i] == 'poetry':
#         color = (0.9,0.9,0.9)
        list_colors.append('r')  
    else:
        list_colors.append(color)    
    

colors = dict(zip(list_co,list_colors))
In [42]:
df_list_year = collections.Counter(df_list.get(1800))
dff_test = pd.DataFrame.from_dict(df_list_year.items())
dff_test[0].unique()
Out[42]:
array(['art', 'non fiction', 'science', 'fiction', 'history',
       'literature', 'poetry', 'discipleship', 'doctrine',
       'religion and spirituality', 'classical liberal', 'philosophy',
       'political', 'comedy', 'entertainment', 'spirituality',
       'adventure', 'romance', 'fantasy', 'biographies and memoirs',
       'essays', 'short stories', 'children', 'photography',
       'social studies', 'travel', 'classics', 'paranormal', 'psychology',
       'anthropology', 'early archaeology', 'fairytales', 'biology',
       'geology', 'ethnography', 'calvinism', 'linguistics', 'evolution',
       'horror', 'mystery and thrillers', 'drama', 'medicine',
       'reference', 'music', 'business', 'investing', 'gender',
       'rhetoric', 'self help', 'civil rights', 'race relations',
       'hebrew', 'nature', 'outdoors', 'young adult', 'theater',
       'economics', 'animal', 'crime', 'piano', 'contemporary', 'diary'],
      dtype=object)

Popular genres</div

In [43]:
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(year):
    """
    given a year, plots the popular genres of it
    """
    assert isinstance(year, int)
    assert year <=2021 and year >=0
    df_list_year = collections.Counter(df_list.get(year))
    dff = pd.DataFrame.from_dict(df_list_year.items())
    dff = dff.sort_values(by=[1]).tail(10)
#     dff = dff[['romance','history','poetry','political','religion and spirituality','crime','mystery and thrillers','biographies and memoirs','fantasy','philosophy']]
    
    # pass colors values to `color=`
    ax.clear()
    ax.barh(dff[0], dff[1], color=[colors[str(x)] for x in dff[0]])
    
    dx = dff[1].max() / 200
    for i, (value, name) in enumerate(zip(dff[1], dff[0])):
        ax.text(value+dx, i,     name.title() ,           size=14, weight=600, ha='left', va='center')
        ax.text(value-dx, i,     f'{value:,.0f}K',  size=14, weight = 600, color = 'w', ha='right',  va='center')
    # ... polished styles
    ax.text(1, 0.4, year, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    #ax.text(0, 1.06, 'Population (thousands)', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Popular Genres from 1800 to 1990',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(0, 1.06, 'Published Records (in Thousands)', transform=ax.transAxes, size=12, color='#777777')
    plt.box(False)
    
draw_barchart(1980)
In [44]:
import matplotlib.animation as animation
from IPython.display import HTML
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1800, 2000, 20))
HTML(animator.to_jshtml()) 
# or use animator.to_html5_video() or animator.save()
Out[44]:
In [45]:
#animator.save('poetry_gif.gif', writer='Pillow', fps=1, dpi = 200)
importing libraries
In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections

#For map
#conda install basemap
In [47]:
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas) 
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these 
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)

df = pd.read_csv('final_dataset.csv',error_bad_lines = False) 

df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])

#Drop the entries not in English
# for j in range(len(df)):
#     for letters in df['name'][j]:
#         if letters.isalpha() == False:
#             df = df.drop([j])
# for j in range(len(df)):
#     letters_list = list(df.iloc[j,0])
#     for letter in letters_list:
#         if letter.isalpha()==False :
#             df = df.drop([j])
        

# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
In [48]:
#overview of the dataframe
df.isnull().sum()
Out[48]:
name                 0
workcount            0
fan_count            0
gender               0
image_url            0
born            178287
died            197029
influence       201635
average_rate         0
rating_count         0
review_count         0
genre           135534
country         164918
dtype: int64
In [49]:
df = df[df['born'].notna()]
In [50]:
df = df[df['country'].notna()]
In [51]:
df = df[df['genre'].notna()]
df.head()
Out[51]:
name workcount fan_count gender image_url born died influence average_rate rating_count review_count genre country
13 John "Red" Shea 4 3 male https://images.gr-assets.com/authors/131048078... 1965-08-12 NaN NaN 3.55 514 56 biographies and memoirs United States
15 Ricardo Ferrari 31 3 male https://images.gr-assets.com/authors/135516310... 1957-01-18 NaN NaN 3.68 345 31 comics Argentina
45 محمد علي البار 54 37 male https://images.gr-assets.com/authors/130966185... 1939-12-29 NaN NaN 3.65 230 36 religion and spirituality,science,spirituality Yemen
51 Keith Roberts 118 24 male https://s.gr-assets.com/assets/nophoto/user/m_... 1935-09-20 2000-10-05 NaN 3.69 5402 552 fantasy,fiction United Kingdom
54 Shuho Sato 68 1 male https://images.gr-assets.com/authors/157798730... 1973-12-08 NaN NaN 3.82 724 27 comics,graphic novels,manga Japan
In [52]:
df = df.reset_index(drop=True)
In [53]:
df = df.sort_values(by=['country'])
df = df.reset_index(drop=True)
df_country_genre = df[['country','genre']]
df_country_genre.head()
Out[53]:
country genre
0 Afghanistan religion and spirituality,spirituality
1 Afghanistan children,romance
2 Afghanistan fiction,literature
3 Afghanistan fiction,literature,poetry,political
4 Albania drama,non fiction,poetry
Plotting span of genres for different countries
In [54]:
from collections import Counter
def split_genre(df_c):
    """
    given the dictionary, plots countries span of genres
    """
    assert isinstance(df_c, pd.core.frame.DataFrame)
    b = DataFrame(df_c.genre.str.split(',').tolist(), index=df_c.country).stack()
    b = b.reset_index()[[0, 'country']] # var1 variable is currently labeled 0
    b.columns = ['genre', 'country'] # renaming var
    
    b = collections.Counter(b.genre).most_common(8)
    # df_counts_top
    b = pd.DataFrame(b, columns = ['genre','count'])
#     b = b.sort_values(by = ['count'])
    b['count'] = b['count']/max(b['count'])
    b['genre'] = b.genre.str.title()
    
    # Initialing the spiderplot by  
    # setting figure size and polar 
    # projection 
    df_counts_top = b#.sample(frac=1).reset_index(drop=True)
    plt.figure(figsize =(15, 10)) 
    plt.subplot(polar = True) 

    theta = np.linspace(0, 2 * np.pi, len(df_counts_top)+1) 

    # Arranging the grid into number  
    # of sales into equal parts in 
    # degrees 
    lines, labels = plt.thetagrids(range(0, 360, int(360/len(df_counts_top['genre']))), 
                                                             list(df_counts_top['genre'])) 

    # Plot actual sales graph 
    plt.plot(theta, list(df_counts_top['count'])+[df_counts_top['count'][0]], color='#1aaf6c') 
    plt.fill(theta, list(df_counts_top['count'])+[df_counts_top['count'][0]], color='#1aaf6c', alpha=0.25) 
    plt.yticks(color='grey', size=25)
    plt.xticks(color='black', size=30)

    # # Add legend and title for the plot 
#     plt.title(country, size = 30) 
    plt.text(4.73,0.45,country,color = 'grey',  weight='medium', size=50, horizontalalignment='center', verticalalignment='top')
    plt.savefig(country + '.png')

    # # Dsiplay the plot on the screen 
#     plt.show() 
    
    return plt.show()
In [55]:
country = 'United States'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)


country = 'France'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)

country = 'India'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)

country = 'China'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)

country = 'Saudi Arabia'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)

country = 'Japan'
df_country = df_country_genre[df_country_genre['country']==country]
df_counts_top = split_genre(df_country)
importing relavant libraries
In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections

#For map
#conda install basemap
In [57]:
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas) 
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these 
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)

df = pd.read_csv('final_dataset.csv',error_bad_lines = False) 

df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])

#Drop the entries not in English
# for j in range(len(df)):
#     for letters in df['name'][j]:
#         if letters.isalpha() == False:
#             df = df.drop([j])
# for j in range(len(df)):
#     letters_list = list(df.iloc[j,0])
#     for letter in letters_list:
#         if letter.isalpha()==False :
#             df = df.drop([j])
        

# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
In [58]:
#overview of the dataframe
df.isnull().sum()
Out[58]:
name                 0
workcount            0
fan_count            0
gender               0
image_url            0
born            178287
died            197029
influence       201635
average_rate         0
rating_count         0
review_count         0
genre           135534
country         164918
dtype: int64
In [59]:
df = df[df['born'].notna()]
In [60]:
df = df[df['country'].notna()]
In [61]:
df = df[df['genre'].notna()]
In [62]:
df = df.reset_index(drop=True)
In [63]:
for i in range(len(df)):
    df.loc[i,'born'] = df.loc[i,'born'][0:4]
In [64]:
df = df.sort_values(by=['born'])
df = df.reset_index(drop=True)
df_born = df[['born','country']]
df_born['born'] = df_born.born.astype(int)
df_born.head()
<ipython-input-64-d0520d2956ca>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'] = df_born.born.astype(int)
Out[64]:
born country
0 1680 Pakistan
1 1684 Norway
2 1685 United Kingdom
3 1685 Germany
4 1688 United Kingdom
In [65]:
year_0 = 1680
i = 0
for i in range(len(list(df_born['born']))):
    if ((int(df_born['born'][i]) >= year_0) and (int(df_born['born'][i]) < year_0 + 10)):
        df_born['born'][i] = year_0
    if int(df_born['born'][i]) >= (year_0 + 10):
        year_0 = year_0 + 10
        df_born['born'][i] = year_0
df_born
<ipython-input-65-d67a8f527f79>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'][i] = year_0
/Users/zhenweiliu/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
<ipython-input-65-d67a8f527f79>:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_born['born'][i] = year_0
Out[65]:
born country
0 1680 Pakistan
1 1680 Norway
2 1680 United Kingdom
3 1680 Germany
4 1680 United Kingdom
... ... ...
17701 2000 Cyprus
17702 2000 United States
17703 2000 Sweden
17704 2010 Panama
17705 2010 United States

17706 rows × 2 columns

In [66]:
df_list = df_born.groupby('born')['country'].apply(list).to_dict()
In [67]:
# import random

# list_co = df_born.country.unique()
# list_colors = []
# for i in range(len(list_co)):
    
#     r = random.random()
#     b = random.random()
#     g = random.random()
#     color = (r, g, b)
    
#     if color not in list_colors:
#         list_colors.append(color)    
    

# colors = dict(zip(list_co,list_colors))
# colors['Japan']
In [68]:
# Japan Highlighted

list_co = df_born.country.unique()
list_colors = []
for i in range(len(list_co)):
    
    color = 'grey'
    if list_co[i] == 'Japan':
#         color = (0.9,0.9,0.9)
        list_colors.append('r')  
    else:
        list_colors.append(color)    
    

colors = dict(zip(list_co,list_colors))
In [69]:
df_list_year = collections.Counter(df_list.get(1800))
df_list_year['United States'] =df_list_year['United States']/5
df_list_year['United Kingdom'] =df_list_year['United Kingdom']/3

dff = pd.DataFrame.from_dict(df_list_year.items())
dff[0]
Out[69]:
0            Germany
1     United Kingdom
2            Hungary
3             France
4             Poland
5            Austria
6      United States
7            Finland
8             Canada
9             Russia
10             Italy
11           Denmark
12            Serbia
13       Switzerland
14           Ireland
15             Spain
16          Slovakia
Name: 0, dtype: object
In [70]:
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(year):
    """
    given a year, plots authors and countries barplot for that
    """
    assert isinstance(year, int)
    assert year > 0 and year <=2021
    df_list_year = collections.Counter(df_list.get(year))
    df_list_year['United States'] =df_list_year['United States']/5
    df_list_year['United Kingdom'] =df_list_year['United Kingdom']/3

    dff = pd.DataFrame.from_dict(df_list_year.items())
    dff = dff.sort_values(by=[1]).tail(10)
#     dff = dff[dff.n ['United States','United Kingdom','France','Germany','Poland']]
    
    
    # pass colors valueso `color=`
    ax.clear()
    ax.barh(dff[0], dff[1], color=[colors[str(x)] for x in dff[0]])

    dx = dff[1].max() / 200
    for i, (value, name) in enumerate(zip(dff[1], dff[0])):
        ax.text(value+dx, i-0.17,     name,           size=14, weight=600, ha='left', va='bottom')
        ax.text(value-dx, i,     f'{value:,.0f}K',  size=14, weight = 700, ha='right', color = 'white', va='center')
    # ... polished styles
    ax.text(1, 0.4, year, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Number of Authors (Thousands)', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Authors & their Countires from 1800 to 1970',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.set
    plt.box(False)
    
draw_barchart(1800)
In [71]:
import matplotlib.animation as animation
from IPython.display import HTML
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1940, 1990, 10))
HTML(animator.to_jshtml()) 

# or use animator.to_html5_video() or animator.save()
Out[71]:
In [72]:
df = pd.read_csv('final_dataset.csv',error_bad_lines = False) 
df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
df_map = df_map.drop(columns=['workcount','fan_count','gender','image_url','born','died',
                              'influence','average_rate','rating_count',
                              'review_count','genre','country'])
df_map = df_map[df_map['latitude'].notna()]
df_map = df_map.reset_index(drop=True)
x = df_map['latitude']
y = df_map['longitude']
In [73]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
In [74]:
from itertools import chain

def draw_map(m, scale=0.2):
    # draw a shaded-relief image
    m.shadedrelief(scale=scale)
    
    # lats and longs are returned as a dictionary
    lats = m.drawparallels(np.linspace(-90, 90, 13))
    lons = m.drawmeridians(np.linspace(-180, 180, 13))

    # keys contain the plt.Line2D instances
    lat_lines = chain(*(tup[1][0] for tup in lats.items()))
    lon_lines = chain(*(tup[1][0] for tup in lons.items()))
    all_lines = chain(lat_lines, lon_lines)
    
    # cycle through these lines and set the desired style
    for line in all_lines:
        line.set(linestyle='-', alpha=0.3, color='w')
In [75]:
fig = plt.figure(figsize=(18, 16), edgecolor='w')
m = Basemap(projection='cyl', resolution=None,
            llcrnrlat=-90, urcrnrlat=90,
            llcrnrlon=-180, urcrnrlon=180, )
x, y = m(df_map['longitude'],df_map['latitude'])
plt.plot(x, y, 'ok', markersize=1)
 
draw_map(m)

PART 2: BOOKS_DATA

Dataset Name: book_data.csv

Description: dataset of 54301 books

includes columns:
1. book_authors
2. book_desc
3. book_edition
4. book_format
5. book_isbn
6. book_pages
7. book_rating
8. book_rating_count
9. book_review_count
10. book_title
11. genres (separated with '|')
12. image_url

Load Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import csv
import seaborn as sns
import statistics as stat
import collections
from numpy.random import rand
from matplotlib.colors import ListedColormap
my_cmap = ListedColormap(sns.color_palette('YlOrBr'))
my_cmap2 = ListedColormap(sns.color_palette('Blues_d'))
from matplotlib.colors import Normalize

Data Preprocessing

{printing attributes, removing null values, drop 'book_isbn'}

In [3]:
books = pd.read_csv('book_data.csv',error_bad_lines = False) 
type(books) # books is a DataFrame
print("There are {} rows and {} columns in the dataset.".format(books.shape[0], books.shape[1]))
attributes = np.array(books.columns)
print(attributes)
There are 54301 rows and 12 columns in the dataset.
['book_authors' 'book_desc' 'book_edition' 'book_format' 'book_isbn'
 'book_pages' 'book_rating' 'book_rating_count' 'book_review_count'
 'book_title' 'genres' 'image_url']
In [4]:
books.head()
Out[4]:
book_authors book_desc book_edition book_format book_isbn book_pages book_rating book_rating_count book_review_count book_title genres image_url
0 Suzanne Collins Winning will make you famous. Losing means cer... NaN Hardcover 9.78044E+12 374 pages 4.33 5519135 160706 The Hunger Games Young Adult|Fiction|Science Fiction|Dystopia|F... https://images.gr-assets.com/books/1447303603l...
1 J.K. Rowling|Mary GrandPré There is a door at the end of a silent corrido... US Edition Paperback 9.78044E+12 870 pages 4.48 2041594 33264 Harry Potter and the Order of the Phoenix Fantasy|Young Adult|Fiction https://images.gr-assets.com/books/1255614970l...
2 Harper Lee The unforgettable novel of a childhood in a sl... 50th Anniversary Paperback 9.78006E+12 324 pages 4.27 3745197 79450 To Kill a Mockingbird Classics|Fiction|Historical|Historical Fiction... https://images.gr-assets.com/books/1361975680l...
3 Jane Austen|Anna Quindlen|Mrs. Oliphant|George... «È cosa ormai risaputa che a uno scapolo in po... Modern Library Classics, USA / CAN Paperback 9.78068E+12 279 pages 4.25 2453620 54322 Pride and Prejudice Classics|Fiction|Romance https://images.gr-assets.com/books/1320399351l...
4 Stephenie Meyer About three things I was absolutely positive.F... NaN Paperback 9.78032E+12 498 pages 3.58 4281268 97991 Twilight Young Adult|Fantasy|Romance|Paranormal|Vampire... https://images.gr-assets.com/books/1361039443l...
In [5]:
#columns which contain null values and the number of null elements
null_counts = books.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False) #null_counts in each column (sorted)
Out[5]:
book_edition    48848
book_isbn       12866
genres           3242
book_pages       2522
book_format      1656
book_desc        1331
image_url         683
dtype: int64
In [6]:
# gives percentages of missing/null data for each column
null_percentages = null_counts/books.shape[0]
null_percentages[null_percentages > 0].sort_values(ascending=False)
Out[6]:
book_edition    0.899578
book_isbn       0.236939
genres          0.059704
book_pages      0.046445
book_format     0.030497
book_desc       0.024512
image_url       0.012578
dtype: float64
In [7]:
#columns
np.array(books.columns)
Out[7]:
array(['book_authors', 'book_desc', 'book_edition', 'book_format',
       'book_isbn', 'book_pages', 'book_rating', 'book_rating_count',
       'book_review_count', 'book_title', 'genres', 'image_url'],
      dtype=object)
In [8]:
# removing book_isbn column from dataframe 'books'
books.drop(labels='book_isbn', axis=1, index=None, columns=None, level=None, inplace=False, errors='raise')
Out[8]:
book_authors book_desc book_edition book_format book_pages book_rating book_rating_count book_review_count book_title genres image_url
0 Suzanne Collins Winning will make you famous. Losing means cer... NaN Hardcover 374 pages 4.33 5519135 160706 The Hunger Games Young Adult|Fiction|Science Fiction|Dystopia|F... https://images.gr-assets.com/books/1447303603l...
1 J.K. Rowling|Mary GrandPré There is a door at the end of a silent corrido... US Edition Paperback 870 pages 4.48 2041594 33264 Harry Potter and the Order of the Phoenix Fantasy|Young Adult|Fiction https://images.gr-assets.com/books/1255614970l...
2 Harper Lee The unforgettable novel of a childhood in a sl... 50th Anniversary Paperback 324 pages 4.27 3745197 79450 To Kill a Mockingbird Classics|Fiction|Historical|Historical Fiction... https://images.gr-assets.com/books/1361975680l...
3 Jane Austen|Anna Quindlen|Mrs. Oliphant|George... «È cosa ormai risaputa che a uno scapolo in po... Modern Library Classics, USA / CAN Paperback 279 pages 4.25 2453620 54322 Pride and Prejudice Classics|Fiction|Romance https://images.gr-assets.com/books/1320399351l...
4 Stephenie Meyer About three things I was absolutely positive.F... NaN Paperback 498 pages 3.58 4281268 97991 Twilight Young Adult|Fantasy|Romance|Paranormal|Vampire... https://images.gr-assets.com/books/1361039443l...
... ... ... ... ... ... ... ... ... ... ... ...
54296 Howard Megdal In this fearless and half-crazy story, Howard ... NaN Hardcover 256 pages 3.37 27 9 Taking the Field: A Fan's Quest to Run the Tea... Sports|Baseball|Sports and Games|Sports|Nonfic... https://images.gr-assets.com/books/1312074392l...
54297 Howard Megdal From the icons of the game to the players who ... NaN Hardcover 256 pages 3.97 34 5 The Baseball Talmud: Koufax, Greenberg, and th... Nonfiction|Sports and Games|Sports https://images.gr-assets.com/books/1348841629l...
54298 Howard Megdal NaN NaN Kindle Edition NaN 3.66 32 3 Wilpon's Folly - The Story of a Man, His Fortu... Sports|Baseball|Abandoned https://images.gr-assets.com/books/1394277097l...
54299 Mimi Baird|Eve Claxton Soon to be a major motion picture, from Brad P... NaN Hardcover 272 pages 3.82 867 187 He Wanted the Moon: The Madness and Medical Ge... Nonfiction|Autobiography|Memoir|Biography|Psyc... https://images.gr-assets.com/books/1403192135l...
54300 Leah Price The Anthology and the Rise of the Novel brings... NaN Paperback 236 pages 3.58 12 3 The Anthology and the Rise of the Novel: From ... Criticism|Literary Criticism|Philosophy|Theory... https://images.gr-assets.com/books/1349014225l...

54301 rows × 11 columns

In [9]:
print("Number of unique Book Titles: ", len(np.unique(books['book_title']))) #unique elements in the column
print("Not unique: ", books.shape[0]- len(np.unique(books['book_title']))) #not unique elements in this column
#print("Book Titles", np.unique(books['book_title'])) #unique elements in this column
Number of unique Book Titles:  48483
Not unique:  5818
In [10]:
print("Number of unique Authors: ", len(np.unique(books['book_authors']))) #unique elements in the column
print("Not unique: ", books.shape[0]- len(np.unique(books['book_authors']))) #not unique elements in this column
#print("Book Authors", np.unique(books['book_authors'])) #unique elements in this column
Number of unique Authors:  27159
Not unique:  27142
Getting top 10 Genres
In [11]:
# parse through genres to idenify unique categories
genre_arr = np.array(books['genres'])
#print(genre_arr)
unique_genres = []
ave_genres_per_book = 0
total_num_books = 0

for indx in genre_arr:
    string = str(indx)
    temp = string.split("|")
    ave_genres_per_book += len(temp)
    total_num_books += 1
    for i in temp:
        if i not in unique_genres:
            unique_genres.append(i)

print("Ave number of Genres per Book: ", str(ave_genres_per_book/total_num_books))
#print("Number of unique genres: ", str(len(unique_genres)))  
Ave number of Genres per Book:  5.582364965654408
In [12]:
# make dictionary where the keys are the unique genres and the values are the count of books that satisfy that genre
genres_dict = dict.fromkeys(unique_genres, 0)
# find book count that falls under every unique genres
for indx in genre_arr:
    string = str(indx)
    temp = tuple(string.split("|"))
    for gen in temp:
        genres_dict[gen] += 1
# sort genres dict
sorted_dict = {}
sorted_keys = sorted(genres_dict, key=genres_dict.get)
for k in sorted_keys:
    sorted_dict[k] = genres_dict[k]
In [13]:
# get top 10 genres only for plotting bar graph
top_ten_genres = list(sorted_dict)[-11:-1]
final_genres_dict = {}
for i in top_ten_genres:
    final_genres_dict[i] =(sorted_dict[i])
keys = final_genres_dict.keys()
values = final_genres_dict.values()
values_list = list(values)  
print(final_genres_dict)
{'Classics': 6379, 'Historical Fiction': 6448, 'Science Fiction': 6821, 'Nonfiction': 7727, 'Mystery': 7960, 'Paranormal': 8018, 'Historical': 10868, 'Young Adult': 11320, 'Romance': 18732, 'Fantasy': 23722}
In [14]:
x1 = list(keys)
y1 = list(values)
# make horizontal bar plot to be more consistent style with other bar plots
barlist = plt.barh(x1, y1)
# use custom color palette to match with genres vs ratings vs frequency bubble plot
colors = ["Crimson", "LightSalmon", "#FFFF00", "#663399", "Orange", "#90EE90", "#808000", "#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]
barlist[0].set_color(colors[1])
barlist[1].set_color(colors[2])
barlist[2].set_color(colors[3])
barlist[3].set_color(colors[4])
barlist[4].set_color(colors[5])
barlist[5].set_color(colors[6])
barlist[6].set_color(colors[7])
barlist[7].set_color(colors[8])
barlist[8].set_color(colors[9])
barlist[9].set_color(colors[10])
plt.title('Top 10 Book Genres')
plt.xlabel('Number of Books')
plt.savefig('top_10_genres.png', dpi=300)
plt.show()
All books rating distribution
In [15]:
# book ratings distribution
sns.displot(books, x='book_rating',binwidth = 0.05, color = 'salmon')
plt.axvline(x= np.mean(books['book_rating']), color="blue", label="mean")
plt.title('Distribution of All Books\' Ratings' )
plt.legend(loc="upper left")
plt.xlim(3,5)
plt.show()

Query 3: Genres Spanned Distrbution and vs Ratings

In [16]:
# BOOK RATINGS VS GENRES SPANNED
# QUERY 2: Books ratings vs # genres spanned 
# (do people like books that are about a variety of themes/topics or just a few?)
# dict key some span of rating (ie. 4 - 4.25, 4.25 - 4.5, etc.)
# value is # of genres they span
genre_span_list = np.zeros(len(books['genres']))
for x in books['genres'].index:
        num_genres = str(books['genres'][x]).count('|') + 1
        genre_span_list[x] = num_genres

#print(len(genre_span_list))
#print(genre_span_list)

genre_span_dict = {}
for x in range(len(genre_span_list)):
    if (genre_span_list[x] not in genre_span_dict):
        genre_span_dict[genre_span_list[x]] = 1
    else:
        genre_span_dict[genre_span_list[x]] += 1

ordered_genre_span_dict = collections.OrderedDict(sorted(genre_span_dict.items()))
x = ordered_genre_span_dict.keys()
y = ordered_genre_span_dict.values()
plt.figure(figsize=(6, 4))
plt.bar(x, y, color='salmon', edgecolor="black", width=1)
plt.title('Distribution of Number of Genres Spanned')
plt.xlabel('Number of Genres')
plt.ylabel('Number of Books')
plt.axvline(x= ave_genres_per_book/total_num_books, color="blue", label="mean")
plt.legend(loc="upper left")
plt.xticks(range(18))
plt.xlim(1,18)
Out[16]:
(1.0, 18.0)
In [17]:
# Add column to books dataframe for rate group (Poor, Bad, Decent, Good, Extremely Good) for plotting queries against
books['rate_group'] = pd.cut(books['book_rating'],bins=[0,3,3.5,4,4.5,5], labels=['Poor (0-3)','Bad (3-3.5)','Decent (3.5-4)', 'Good (4-4.5)','Extremely Good (4.5-5)'])
books.head()
Out[17]:
book_authors book_desc book_edition book_format book_isbn book_pages book_rating book_rating_count book_review_count book_title genres image_url rate_group
0 Suzanne Collins Winning will make you famous. Losing means cer... NaN Hardcover 9.78044E+12 374 pages 4.33 5519135 160706 The Hunger Games Young Adult|Fiction|Science Fiction|Dystopia|F... https://images.gr-assets.com/books/1447303603l... Good (4-4.5)
1 J.K. Rowling|Mary GrandPré There is a door at the end of a silent corrido... US Edition Paperback 9.78044E+12 870 pages 4.48 2041594 33264 Harry Potter and the Order of the Phoenix Fantasy|Young Adult|Fiction https://images.gr-assets.com/books/1255614970l... Good (4-4.5)
2 Harper Lee The unforgettable novel of a childhood in a sl... 50th Anniversary Paperback 9.78006E+12 324 pages 4.27 3745197 79450 To Kill a Mockingbird Classics|Fiction|Historical|Historical Fiction... https://images.gr-assets.com/books/1361975680l... Good (4-4.5)
3 Jane Austen|Anna Quindlen|Mrs. Oliphant|George... «È cosa ormai risaputa che a uno scapolo in po... Modern Library Classics, USA / CAN Paperback 9.78068E+12 279 pages 4.25 2453620 54322 Pride and Prejudice Classics|Fiction|Romance https://images.gr-assets.com/books/1320399351l... Good (4-4.5)
4 Stephenie Meyer About three things I was absolutely positive.F... NaN Paperback 9.78032E+12 498 pages 3.58 4281268 97991 Twilight Young Adult|Fantasy|Romance|Paranormal|Vampire... https://images.gr-assets.com/books/1361039443l... Decent (3.5-4)
In [18]:
genre_span_arr = np.array(genre_span_list)
books = books.sort_index(axis=0,ascending=True)
books['genres_spanned'] = genre_span_arr
#books.head()
In [19]:
d = books.groupby(['genres_spanned', 'rate_group'])['genres'].size().unstack()
d.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Number of Genres Spanned', colormap=my_cmap)
plt.xlabel('Number of Genres Spanned')
plt.ylabel('Number of Books')
plt.tight_layout()
plt.savefig('genre_span_rating.png', dpi=300)

Query 4: Book Format vs Rating

In [20]:
# QUERY: BOOK FORMAT VS RATINGS
book_form = np.array(books['book_format'])
book_form_dict = {}
for x in book_form:
    if x not in book_form_dict:
        book_form_dict[x] = 1
    elif x in book_form_dict:
        book_form_dict[x] += 1
#print(book_form_dict)
# get top 10 book formats
sorted_form_dict = {}
sorted_keys2 = sorted(book_form_dict, key=book_form_dict.get)
for k in sorted_keys2:
    sorted_form_dict[k] = book_form_dict[k]
#print(sorted_form_dict)
top_ten_formats = list(sorted_form_dict)[-1]
final_form_dict = {}

final_form_dict = {'ebook': 2534, 'Mass Market\nPaperback': 2668, 'Kindle Edition': 5436, 'Hardcover': 12163, 'Paperback': 28725}
keys2 = final_form_dict.keys()
values2 = final_form_dict.values()

x = list(keys2)
y = list(values2)
barlist2= plt.barh(x, y,color='#f4811d')
color_arr = ['#fff4b6', '#feda7e', '#feb23f', '#f4811d', '#d55607', '#a03704']
barlist2[0].set_color(color_arr[1])
barlist2[1].set_color(color_arr[2])
barlist2[2].set_color(color_arr[3])
barlist2[3].set_color(color_arr[4])
barlist2[4].set_color(color_arr[5])

plt.title('Top 5 Book Formats')
plt.xlabel('Number of Books')
for i in range(my_cmap.N):
    rgba = my_cmap(i)

# BOOK RATINGS VS BOOK FORMAT
book_format_list = np.zeros(len(books['book_format'])) # keys are 
paperback_count = 0
hardcover_count = 0
kindle_count = 0
mmp_count = 0
ebook_count = 0
other_count = 0
book_form_list = []

#books = books.sort_index(axis=0,ascending=True)
#books['genres_spanned'] = genre_span_arr
for x in books['book_format'].index:
    if books['book_format'][x]== 'Paperback':
        paperback_count+=1
        book_form_list.append(1)
    elif books['book_format'][x]== 'Hardcover':
        hardcover_count+=1
        book_form_list.append(2)
    elif books['book_format'][x]== 'Kindle Edition':
        kindle_count+=1
        book_form_list.append(3)
    elif books['book_format'][x]== 'Mass Market Paperback':
        mmp_count+=1
        book_form_list.append(4)
    elif books['book_format'][x]== 'ebook':
        ebook_count+=1
        book_form_list.append(5)
    else:
        book_form_list.append(0)
        other_count+=1

book_form_arr = np.array(book_form_list)
books['books_format'] = book_form_arr
#books.head()
sss = books.groupby(['books_format', 'rate_group'])['genres'].size().unstack()
ax = sss.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Top 5 Book Formats',colormap=my_cmap)
plt.legend(loc="upper right")
plt.tight_layout()
plt.xlim(0.5, 5.5)
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_xticklabels(['Paperback', 'Hardcover', 'Kindle Edition', 'Mass Market\n Paperback', 'ebook'], rotation=0)
ax.set_xlabel('Book Formats\n')
plt.savefig('book_format_vs_rating3.png', dpi=300)

Query 5: Title Word Length vs Rating

In [21]:
# BOOK RATINGS VS NUMBER OF LETTERS IN TITLE
title_len_list = np.zeros(len(books['book_title']))
for x in books['book_title'].index:
        num_title_words = int(len(str(books['book_title'][x]).split()))
        title_len_list[x] = num_title_words
#print(title_len_list)
title_len_arr = np.array(title_len_list)
books['title_word_len'] = title_len_arr
#books.head()
In [22]:
ss = books.groupby(['title_word_len', 'rate_group'])['genres'].size().unstack()
ss.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Number of Words in Title',colormap=my_cmap)
plt.axvline(x= np.mean(books['title_word_len']), color="navy", label="mean")
plt.legend(loc="upper right")
plt.xlim(-0.5, 14.5)
plt.xlabel('Number of Words in Book Title')
plt.savefig('book_word_len_vs_rating.png', dpi=300)
In [23]:
# add book pages column count in dataframe, for correlation matrix
book_pages = books['book_pages'].str.rstrip('pages ')
book_pages = book_pages.dropna()
book_pages = book_pages.astype(int, copy=True, errors='raise')
#book_pages = book_pages.sort_values(False)
#print(book_pages)
books['book_pgs'] = book_pages

Correlation Matrix

In [24]:
# Correlation Matrix
df_core = pd.concat([books['book_rating'], books['book_pgs'], books['book_rating_count'],books['book_review_count'],books['genres_spanned'], books['title_word_len'], books['books_format']], axis=1, keys=['book_rating', 'book_pages', 'rate_count', 'rev_count', 'genres_spanned', 'title_word_len','books_format'])
#print(df_core)
#df_core.corr()
corrMatrix2 = df_core.corr()
#print (corrMatrix2)
sns.heatmap(corrMatrix2, annot=True)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300)
plt.show()

Genre and Title Analysis

Importing Libraries
In [1]:
import pandas as pd #for importing csv file
import numpy as np #for sum mathematical stuff
import matplotlib.pyplot as plt #for plotting
import nltk  
import numpy as np  
import random  
import string
import bs4 as bs  
import urllib.request  
import re  
import time
import string
import collections
from collections import Counter
from wordcloud import WordCloud
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import nltk  
import numpy as np  
import random  
import string
import bs4 as bs  
import urllib.request  
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
Loading Datasets
In [2]:
books = pd.read_csv('book_data.csv',error_bad_lines = False) 
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these “bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
Exploring the dataset content
In [3]:
print("There are {} rows and {} columns in the dataset.".format(books.shape[0], books.shape[1]))
There are 54301 rows and 12 columns in the dataset.
In [4]:
#books.shape #table dimensions
Dataset columns
In [6]:
#columns
np.array(books.columns)
Out[6]:
array(['book_authors', 'book_desc', 'book_edition', 'book_format',
       'book_isbn', 'book_pages', 'book_rating', 'book_rating_count',
       'book_review_count', 'book_title', 'genres', 'image_url'],
      dtype=object)

Columns are 'book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn', 'book_pages', 'book_rating', 'book_rating_count', 'book_review_count', 'book_title', 'genres', 'image_url'

Depicting the first 5 rows of this dataset with head method.

In [10]:
books.head()
Out[10]:
book_authors book_desc book_edition book_format book_isbn book_pages book_rating book_rating_count book_review_count book_title genres image_url
0 Suzanne Collins Winning will make you famous. Losing means cer... NaN Hardcover 9.78044E+12 374 pages 4.33 5519135 160706 The Hunger Games Young Adult|Fiction|Science Fiction|Dystopia|F... https://images.gr-assets.com/books/1447303603l...
1 J.K. Rowling|Mary GrandPré There is a door at the end of a silent corrido... US Edition Paperback 9.78044E+12 870 pages 4.48 2041594 33264 Harry Potter and the Order of the Phoenix Fantasy|Young Adult|Fiction https://images.gr-assets.com/books/1255614970l...
2 Harper Lee The unforgettable novel of a childhood in a sl... 50th Anniversary Paperback 9.78006E+12 324 pages 4.27 3745197 79450 To Kill a Mockingbird Classics|Fiction|Historical|Historical Fiction... https://images.gr-assets.com/books/1361975680l...
3 Jane Austen|Anna Quindlen|Mrs. Oliphant|George... «È cosa ormai risaputa che a uno scapolo in po... Modern Library Classics, USA / CAN Paperback 9.78068E+12 279 pages 4.25 2453620 54322 Pride and Prejudice Classics|Fiction|Romance https://images.gr-assets.com/books/1320399351l...
4 Stephenie Meyer About three things I was absolutely positive.F... NaN Paperback 9.78032E+12 498 pages 3.58 4281268 97991 Twilight Young Adult|Fantasy|Romance|Paranormal|Vampire... https://images.gr-assets.com/books/1361039443l...
Checking for null cells in the dataset
In [12]:
#columns which contain null values and the number of null elements
null_counts = books.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False) #null_counts in each column (sorted)
Out[12]:
book_edition    48848
book_isbn       12866
genres           3242
book_pages       2522
book_format      1656
book_desc        1331
image_url         683
dtype: int64
ْQuerry: Most frequent words in title of a specific genre
In [14]:
#removing stopwords from the book description #future: use the initial form of words
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#stop_words
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fatemeh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [15]:
books[['book_title', 'genres']]
Out[15]:
book_title genres
0 The Hunger Games Young Adult|Fiction|Science Fiction|Dystopia|F...
1 Harry Potter and the Order of the Phoenix Fantasy|Young Adult|Fiction
2 To Kill a Mockingbird Classics|Fiction|Historical|Historical Fiction...
3 Pride and Prejudice Classics|Fiction|Romance
4 Twilight Young Adult|Fantasy|Romance|Paranormal|Vampire...
... ... ...
54296 Taking the Field: A Fan's Quest to Run the Tea... Sports|Baseball|Sports and Games|Sports|Nonfic...
54297 The Baseball Talmud: Koufax, Greenberg, and th... Nonfiction|Sports and Games|Sports
54298 Wilpon's Folly - The Story of a Man, His Fortu... Sports|Baseball|Abandoned
54299 He Wanted the Moon: The Madness and Medical Ge... Nonfiction|Autobiography|Memoir|Biography|Psyc...
54300 The Anthology and the Rise of the Novel: From ... Criticism|Literary Criticism|Philosophy|Theory...

54301 rows × 2 columns

In [16]:
desired_genres = ['Fiction', 'Classics', 'Sports', 'Romance'] #from another analysis in the following
In [17]:
#books['genres'].iloc[0], books['book_title'].iloc[0]
In [18]:
#books = books[books['book_title'].notna()] #removing nulls in book_title column
books = books[books['genres'].notna()] #removing nulls in genres column
In [19]:
s = time.time()
fiction_string = ""
classics_string = ""
sports_string = ""
romance_string = ""
for i in range(len(books)):
    genre = books['genres'].iloc[i]
    title = books['book_title'].iloc[i]
    if desired_genres[0] in genre:
        fiction_string+= str(title) + " "
    if desired_genres[1] in genre:
        classics_string+= str(title) + " "
    if desired_genres[2] in genre:
        sports_string+= str(title) + " "
    if desired_genres[3] in genre:
        romance_string+= str(title) + " "
print("Took {} seconds".format(time.time() -s ))
Took 0.7864232063293457 seconds
In [20]:
#function for removing non-english words
def isEnglish(s):
    """
    checks whether s is a string or not
    """
    assert isinstance(s, str)
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
assert not isEnglish('کتاب')
In [21]:
fiction_string[:100]
Out[21]:
'The Hunger Games Harry Potter and the Order of the Phoenix To Kill a Mockingbird Pride and Prejudice'
In [22]:
import collections
from collections import Counter

stopwords_dict = Counter(stop_words)
def clean_words (string_g):
    """
    given a string, removes stopwords, non-english words, punctuations
    """
    #fiction_string = ' '.join([word for word in fiction_string.split() if word not in stopwords_dict])
    #fiction_string = ' '.join([word for word in fiction_string.split() if isEnglish(word)])
    string_g = "".join(l for l in string_g if l not in string.punctuation) #remove punctuation
    string_words = string_g.split()
    string_words = [string_words[i].lower() for i in range(len(string_words))] #lower
    #fiction_words = [fiction_words[i] if fiction_words[i] not in stopwords_dict] #not stop words
    words = []
    for i in range(len(string_words)): #english
        if string_words[i] not in stopwords_dict:
            if isEnglish(string_words[i]):
                if string_words[i] not in stop_words:
                    words.append(string_words[i])
                else:
                    pass
    word_freq = {}
    for i in words:
        if i not in word_freq and len(i)>2:
            word_freq[i] = 1
        elif len(i)>2 and i in word_freq:
            word_freq[i] +=1
    return word_freq
In [23]:
#returning most frequent words of a dictionary
def most_freq_in_dictionary(diction, top):
    """
    returning most frequent words of a dictionary
    """
    assert isinstance(diction, dict)
    assert isinstance(top, int)
    assert top > 0
    sorted_diction = {k: v for k, v in sorted(diction.items(), key=lambda item: item[1])}
    values = sorted(sorted_diction.values())
    wc = {}
    for k,v in sorted_diction.items():
        if v in values[-top:]:
            wc[k] = v
    #value, count = collections.Counter(diction.values()).most_common(top)
    return sorted_diction, values[-top:], wc
In [24]:
word_freq = clean_words(fiction_string)
d_fiction, v_fiction, wc_fiction = most_freq_in_dictionary(word_freq, 50)

word_freq = clean_words(classics_string)
d_classics, v_classics, wc_classics = most_freq_in_dictionary(word_freq, 50)

word_freq = clean_words(sports_string)
d_sports, v_sports, wc_sports = most_freq_in_dictionary(word_freq, 50)

word_freq = clean_words(romance_string)
d_romance, v_romance, wc_romance = most_freq_in_dictionary(word_freq, 50)
Plotting word clouds
In [28]:
#!conda install -c conda-forge wordcloud=1.6.0 
In [35]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """
    showing a string's words as a wordcloud
    """
    assert isinstance(data, str)
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=10,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
In [53]:
def plot_wc(diction, filename):
    """
    given a dictionary plots its wc
    """
    assert isinstance(diction, dict)
    assert isinstance(filename, str)
    #word_could_dict=Counter(g)
    custom_mask = np.array(Image.open("book.png"))
    wordcloud = WordCloud(background_color="white", 
                          #mode="RGBA",
                          #colormap='Dark2',
                          colormap='RdBu',
                          #colormap='BrBG',
                          collocations=False, 
                          mask=custom_mask, contour_width=1, 
                          contour_color='black',
                         width=1200, height=1000,
                          max_font_size=80, 
                          scale=3,
                         ).generate_from_frequencies(diction)
    #wc = WordCloud(background_color="white", mask=custom_mask)
    #wc = WordCloud(background_color="white", collocations=False, mask=custom_mask, contour_width=1, contour_color='gray')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig("{}.png".format(filename))
    plt.show()
#source: https://medium.com/swlh/masking-with-wordcloud-in-python-500-most-frequently-used-words-in-german-c0e865e911bb
In [39]:
plot_wc(wc_sports, "sport_wc")
In [43]:
plot_wc(wc_fiction, "fiction_wc")
In [48]:
plot_wc(wc_classics, "classics_wc")
In [56]:
plot_wc(wc_romance, "romance_wc")
Querry: Most frequent books counts vs ratings
In [57]:
g= [] #genres
for i in range(books.shape[0]):
    if isinstance(books['genres'].iloc[i], str) and len(books['genres'].iloc[i])>0:
        g.append(books['genres'].iloc[i])
In [58]:
#figuring out all the existing genres and saving the genres in Genres_list list
Genres_dict = {}
for i in range(len(books)):
    row = books.iloc[i]
    Gen = str(row.genres)
    genres = Gen.split('|')
    for genre in genres:
        if genre not in Genres_dict:
            Genres_dict[str(genre)] = []
            Genres_dict[str(genre)].append(books.iloc[i]['book_rating'])
        else:
            Genres_dict[str(genre)].append(books.iloc[i]['book_rating'])
#len(Genres_list)
In [59]:
#figuring out all the existing genres and saving the genres in Genres_list list
Genres_dict_count = {}
for i in range(len(books)):
    row = books.iloc[i]
    Gen = str(row.genres)
    genres = Gen.split('|')
    for genre in genres:
        if genre not in Genres_dict_count:
            Genres_dict_count[str(genre)] = 1
        else:
            Genres_dict_count[str(genre)] +=1
#len(Genres_list)
In [60]:
genres_average_rating = {}
for k, v in Genres_dict.items():
    if k not in genres_average_rating:
        genres_average_rating[k] = np.mean(v)
In [61]:
#also can be derived from our function in part 1
most_frequent_gens ={'Contemporary': 6039, 
 'Classics': 6272,
 'Historical Fiction': 6399,
 'Science Fiction': 6780,
 'Nonfiction': 7598,
 'Mystery': 7902,
 'Paranormal': 7994,
 'Historical': 10789,
 'Young Adult': 11251,
 'Romance': 18636,
 'Fantasy': 23583,
 'Fiction': 25736}
In [62]:
frequent_gs = list(most_frequent_gens.keys())
frequent_counts = list(most_frequent_gens.values())
frequent_rates = [genres_average_rating[i] for i in frequent_gs]
In [63]:
#### import random
number_of_colors =len(frequent_gs) #len
r = ["#"+''.join([random.choice('0123456789ABCDFE') for j in range(6)])
             for i in range(number_of_colors)]
random_colors = ["Crimson", "pink", "#FFFF00", "#663399","Orange", "#90EE90", "#808000", 
                "#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]
plt.figure(figsize=(12, 9))
for i in range(len(frequent_counts)):
    plt.scatter(frequent_counts[i],frequent_rates[i],
                      c=random_colors[i],
                     alpha=0.5,
               s = frequent_counts[i], label=frequent_gs[i])
for i, j in zip(frequent_counts, frequent_rates):
    plt.text(i-300, j, s=str(np.round(j, 2)), c="navy", size=10)
    
plt.rcParams["legend.markerscale"] = 0.1
plt.legend(markerscale=0.07,bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.xlabel("Frequency of genres", size=14)
plt.ylabel("Average rating", size=14)
plt.title("Most frequent genres average ratings vs frequency", size=16)
plt.savefig("freq_genres_ratings_2.png")
Prediction of genre based on its title
In [64]:
s= [] #stephan king
for i in range(len(books)):
    if "Stephen King" in books.iloc[i]["book_authors"]:
        #print( books.iloc[i]["book_title"])
        s.append(books.iloc[i]["book_title"])
In [65]:
titles_genres = books[['book_title', 'genres']] #picking just title and genres
In [66]:
word_freq = clean_words(romance_string)
d_romance, v_romance, wc_romance = most_freq_in_dictionary(word_freq, 150)

word_freq = clean_words(sports_string)
d_sports, v_sports, wc_sports = most_freq_in_dictionary(word_freq, 100)

word_freq = clean_words(classics_string)
d_classics, v_classics, wc_classics = most_freq_in_dictionary(word_freq, 150)

r = list(wc_romance.keys())
s = list(wc_sports.keys())
c = list(wc_classics.keys())
In [67]:
word_vectors_rsc = r+s+c
word_vectors_rsc = list(np.unique(word_vectors_rsc))
In [68]:
books = books[books['genres'].notna()] 
In [70]:
numbers = [0]*3
r_ind = []
s_ind = []
c_ind = []
for i in range(len(books)):
    if 'Romance' in books.genres.iloc[i]:
        numbers[0]+=1
        r_ind.append(i)
    if 'Sport' in books.genres.iloc[i]:
        numbers[1]+=1
        s_ind.append(i)
    if 'Classics' in books.genres.iloc[i]:
        numbers[2]+=1
        c_ind.append(i)
indices = [r_ind, s_ind, c_ind]
indices = [item for sublist in indices for item in sublist]

cleaned_title = []
cleaned_title.append(clean_words(books.iloc[0]['book_title']))
cleaned_title

word_vec = np.zeros((len(indices), len(word_vectors_rsc)))
word_vec.shape

#converting titles to word vectors
for i in range(len(indices)):
    index = indices[i]
    cleaned_title = clean_words(books.iloc[index]['book_title'])
    for j in range(len(word_vectors_rsc)):
        if word_vectors_rsc[j] in cleaned_title.keys():
            #print(word_vectors_rsc[j])
            word_vec[i][j] = 1
            
y = []
for i in r_ind:
    y.append(0) #romance
for j in s_ind:
    y.append(1) #sport
for k in c_ind:
    y.append(2)
In [71]:
y.count(0), y.count(1), y.count(2), numbers
Out[71]:
(12910, 539, 6317, [12910, 539, 6317])
SVM
In [74]:
#ran on server
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = word_vec
y = y
iterations = 10 #ran 10 times in the main file
t = []
acc = []
for i in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    #print(len(X_train), len(X_test))
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    #print("accuracy is ", accuracy_score(y_test, predicted))
    #print("took {} second".format(time.time()-s))
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time()-s)
In [75]:
s = time.time()
X = word_vec
y = y
t = []
acc = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
In [76]:
s = time.time()
X = word_vec
y = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
Out[76]:
([0.7079258010118044, 0.712310286677909],
 [100.84012222290039, 100.6418559551239])
In [77]:
s = time.time()
X = word_vec
y = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
Out[77]:
([0.7079258010118044, 0.712310286677909, 0.7003372681281619],
 [100.84012222290039, 100.6418559551239, 97.06852984428406])
In [78]:
s = time.time()
X = word_vec
y = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
Out[78]:
([0.7079258010118044,
  0.712310286677909,
  0.7003372681281619,
  0.702866779089376],
 [100.84012222290039,
  100.6418559551239,
  97.06852984428406,
  104.13240194320679])
In [79]:
s = time.time()
X = word_vec
y = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
Out[79]:
([0.7079258010118044,
  0.712310286677909,
  0.7003372681281619,
  0.702866779089376,
  0.6993254637436762],
 [100.84012222290039,
  100.6418559551239,
  97.06852984428406,
  104.13240194320679,
  101.91149282455444])
In [80]:
np.min(acc), np.mean(acc), np.max(acc)
SVM_acc = acc
SVM_time = t
RF
In [81]:
s = time.time()
X = word_vec
y = y
acc = []
t = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = RandomForestClassifier(max_depth=20)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
Out[81]:
([0.6890387858347387], [3.5418617725372314])
In [82]:
#ran on server
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = word_vec
y = y
iterations = 10
t = []
acc = []
for i in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    #print(len(X_train), len(X_test))
    clf = RandomForestClassifier(max_depth=20)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    #print("accuracy is ", accuracy_score(y_test, predicted))
    #print("took {} second".format(time.time()-s))
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time()-s)
RF_acc = acc
RF_t = t
In [83]:
RF_acc
Out[83]:
[0.6792580101180439,
 0.6851602023608769,
 0.6893760539629005,
 0.6934232715008432,
 0.6763912310286678,
 0.6871838111298483,
 0.6821247892074199,
 0.6826306913996627,
 0.6806070826306914,
 0.6833052276559866]
KNN
In [85]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
In [86]:
#ran on server
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
X = word_vec
y = y
iterations = 10
t = []
acc = []
for i in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    #print(len(X_train), len(X_test))
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    #print("accuracy is ", accuracy_score(y_test, predicted))
    #print("took {} second".format(time.time()-s))
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time()-s)
KNN_acc = acc
KNN_t = t
In [87]:
print(np.min(SVM_acc), np.mean(SVM_acc), np.max(SVM_acc))
print(np.min(KNN_acc), np.mean(KNN_acc), np.max(KNN_acc))
print(np.min(RF_acc), np.mean(RF_acc), np.max(RF_acc))
0.6993254637436762 0.7045531197301854 0.712310286677909
0.5369308600337268 0.6642495784148397 0.7040472175379426
0.6763912310286678 0.6839460370994941 0.6934232715008432
In [88]:
import numpy as np
import matplotlib.pyplot as plt
data = [[np.min(SVM_acc), np.mean(SVM_acc), np.max(SVM_acc)],
[np.min(RF_acc), np.mean(RF_acc), np.max(RF_acc)], 
[np.min(KNN_acc), np.mean(KNN_acc), np.max(KNN_acc)]]
X = np.arange(3)
fig = plt.figure()
plt.ylabel("Prediction accuracy")
plt.ylim(0, 1)
x1 = [0, 1,2,3, 4, 5]
squad = ['Minimum','Average', 'Maximum']
plt.xticks(x1, squad)
# Add the label as annotation. The "5" is the padding betweent the right side
# of the axis and the label...
plt.title("Genre (Romance | Sports | Classics) Prediction Accuracy", size=12)
plt.bar(X, data[0], color = 'gold', width = 0.25, label = 'SVM')
plt.bar(X + 0.25, data[1], color = 'navy', width = 0.25, label = 'RF')
plt.bar(X + 0.50, data[2], color = 'LightGreen', width = 0.25, label = 'KNN')
plt.legend(loc=2)
plt.savefig("prediction.png")
Distribution of ratings
In [90]:
all_ratings = []
for i in Genres_dict.values():
    for j in i:
        all_ratings.append(j)
In [91]:
fiction_ratings = Genres_dict['Fiction']
fantasy_ratings = Genres_dict['Fantasy']
romance_ratings = Genres_dict['Romance']
sports_ratings = Genres_dict['Sports']
historical_ratings = Genres_dict['Historical']
mystry_ratings = Genres_dict['Mystery']
In [92]:
ys = [fiction_ratings, fantasy_ratings, romance_ratings, sports_ratings, historical_ratings, mystry_ratings, all_ratings]
In [93]:
data = pd.DataFrame(ys).T
data.columns = ['Fiction', 'Fantasy', 'Romance', 'Sports', 'Historical', 'Mystery', "all"]
data.head()
Out[93]:
Fiction Fantasy Romance Sports Historical Mystery all
0 4.33 4.33 4.25 4.21 4.27 3.81 4.33
1 4.48 4.48 3.58 4.21 4.36 3.79 4.48
2 4.27 3.58 4.29 4.39 4.29 4.30 3.58
3 4.25 3.58 4.24 4.39 4.29 4.30 4.36
4 3.58 4.25 3.84 4.45 4.09 4.30 4.25
In [173]:
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(12, 6))
data = ys #[np.random.normal(0, std, 1000) for std in range(1, 6)]
plt.ylim(0,5)
box = plt.boxplot(data, notch=True, patch_artist=True)
colors_ = ["#ef8783", "#9bc295", "#e8d197", "#90EE90", "#97c1f9", "#d1f4cb", "orange"]
plt.xticks(np.arange(8), ['', 'Fiction', 'Fantasy', 'Romance', 'Sports', 'Historical', 'Mystry', 'All genres'], rotation=0)
plt.xlabel("Genres", size=12)
plt.ylabel("Rating Dirstribution", size=12)
plt.title("Distribution of Ratings for different genres", size=15)
ax = plt.subplots()[1]

for patch, color in zip(box['boxes'], colors_):
    patch.set_facecolor(color)
ax.set_xticklabels(labels)
plt.savefig("ratings_dist.png")
plt.show()
In [149]:
goodreads = pd.read_csv('books.csv',error_bad_lines = False)
books.columns = ['book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn',
       'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
       'title', 'genres', 'image_url']
b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'
In [150]:
merged = pd.merge(books, goodreads, on='title')
In [151]:
merged.columns
Out[151]:
Index(['book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn',
       'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
       'title', 'genres', 'image_url', 'bookID', 'authors', 'average_rating',
       'isbn', 'isbn13', 'language_code', '  num_pages', 'ratings_count',
       'text_reviews_count', 'publication_date', 'publisher'],
      dtype='object')
In [152]:
x = merged[["title", "book_authors", "genres", "book_format", "book_rating", "book_rating_count", "book_review_count", "publication_date", "book_pages"]].tail()
x.columns = ["title", "author(s)", "genres", "format", "average_rating", "rating_count", "review_count", "publication_date", "num_pages"]
x
Out[152]:
title author(s) genres format average_rating rating_count review_count publication_date num_pages
5284 When We Were Orphans Kazuo Ishiguro Fiction|Historical|Historical Fiction|Mystery|... Paperback 3.47 21291 2037 12/1/2007 336 pages
5285 The Rachel Papers Martin Amis Fiction|Novels|Contemporary|European Literatur... Paperback 3.59 8404 397 9/29/1992 240 pages
5286 The Delta Star Joseph Wambaugh Fiction|Mystery|Mystery|Crime Paperback 3.63 755 22 1/1/1984 291 pages
5287 The Best Short Stories J.G. Ballard|Anthony Burgess Short Stories|Fiction|Science Fiction|Literature Paperback 4.20 1441 92 2/13/2001 302 pages
5288 Homegrown Democrat: A Few Plain Thoughts from ... Garrison Keillor Nonfiction|Politics|Humor|Autobiography|Memoir Paperback 3.97 1266 154 8/29/2006 288 pages
Best Authors
Preserving all the unique authors with their number of books in this dataset.
Preserving all the unique authors with their number of books in this dataset.
In [156]:
authors_freq = {}
all_authors = []
auth_ratings ={}
for i in range(len(books)):
    authors = books['book_authors'].iloc[i].split("|")
    for auth in authors:
        all_authors.append(auth)
        if auth not in authors_freq:
            authors_freq[auth] = 1
        else:
            authors_freq[auth] +=1
In [157]:
for i in range(len(books)):
    authors = books['book_authors'].iloc[i].split("|")
    rating = books.iloc[i]['book_rating']
    for auth in authors:
        auth_ratings[auth]= []
In [158]:
for i in range(len(books)):
    authors = books['book_authors'].iloc[i].split("|")
    rating = books.iloc[i]['book_rating']
    for auth in authors:
        auth_ratings[auth].append(rating)
In [159]:
auth_ratings_av = {}
for k,v in auth_ratings.items():
    auth_ratings_av[k] = np.mean(v)
In [160]:
freqs = list(authors_freq.values())
In [161]:
def find_author_with_number_of_works(i, d):
    """
    i is the number of works 
    """
    #assert isinstance(i, int)
    assert i>=0
    filtered_dict = {k:v for k,v in d.items() if v==i} #future asserrtion, comments, etc.
    return filtered_dict
In [162]:
len(find_author_with_number_of_works(1,  authors_freq))
len(find_author_with_number_of_works(2,  authors_freq))
Out[162]:
3990
In [163]:
i = 1
res = find_author_with_number_of_works(i, authors_freq)
max_rate = 0
chosen_author = ""
for k,v in res.items():
    if auth_ratings_av[k] >max_rate:
        max_rate = auth_ratings_av[k]
        chosen_author = k
print(chosen_author, max_rate, i)
Pantson Fire 5.0 1
In [164]:
result = [] #authors chosen
for i in sorted(np.unique(freqs)):
    res = find_author_with_number_of_works(i, authors_freq)
    max_rate = 0
    chosen_author = ""
    for k,v in res.items():
        if auth_ratings_av[k] >max_rate:
            max_rate = auth_ratings_av[k]
            chosen_author = k
    result.append((chosen_author, np.round(max_rate,2), i))
In [165]:
#generating random colors
import matplotlib.pyplot as plt
import random

number_of_colors =len(freqs) #len

random_colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
             for i in range(number_of_colors)]
In [166]:
from colorsys import hls_to_rgb

def rainbow_color_stops(n=10, end=1):
    return [ hls_to_rgb(end * i/(n-1), 0.2, 1) for i in range(n) ]
cs = rainbow_color_stops(len(freqs))
In [167]:
freq_counts = np.zeros(len(np.unique(freqs)))
for i in range(1, len(np.unique(freqs)+2)):
    freq_counts[i] = freq_counts[i-1]+freqs.count(i)
In [179]:
# libraries
import matplotlib.pyplot as plt
import numpy as np

# create data
x = np.arange(len(freqs)) 
#y =[(i)*np.random.rand()*np.random.rand() for i in (np.arange(len(freqs), 0, -1))] #np.random.rand(len(freqs)) #np.random.rand(len(freqs))
y = np.random.rand(len(freqs))
z = sorted(freqs)
#plt.text(freqs.count(1), 0.5,str(result[0]), size=15, color='red')
# use the scatter function
fig=plt.figure(figsize=(20, 15))
plt.scatter(x, y, s=z, c=second_plot_colors, alpha=0.5)
#plt.ylim(0, len(freqs)+4000) 
#plt.xlim(-500, len(freqs)+6000)
#plt.text(freqs.count(1), 0.5,str(result[0]), size=15, color='red')
plt.xlim(0, len(freqs)+10000)
plt.title("Best authors based on frequency (name, average rate, #books) | Distribution of author's number of books", size=18)
plt.xlabel("Number of authors", size=15)
for i in range(1, len(freq_counts)+1, 2):
    s = result[i-1]
    s = s[0] + " " + str(s[1]) + " "+ str(s[2])
    #plt.text(freq_counts[i]+100,1-i*0.013,str(result[i]), size=14, color='black')
    if i>10:
        plt.text(30000,1-i*0.013,str(s), size=14, color='black') #ommited
    else:
        plt.text(30000,1-i*0.013,str(s), size=14, color='black') #ommited
#plt.text(10000,0.4,"Authors with 1 book: 68.9%", size=30, color=second_plot_colors[0], rotation=90) #ommited
#plt.text(freqs.count(1)+freqs.count(2)/2,0.4,"Authors with 2 books: 13.2%", size=30, color=second_plot_colors[freqs.count(1)+1], rotation=90) #ommited
#plt.text(freqs.count(1)+freqs.count(2)+freqs.count(3)/3,0.4,"Authors with 3 books: 5.76%", size=30, color=second_plot_colors[freqs.count(1)+freqs.count(2)], rotation=90) #ommited
#plt.text(freqs.count(1)+freqs.count(2)+freqs.count(3)+freqs.count(4)/8,0.4,"Authors with 4 books: 3.19%", size=30, color=second_plot_colors[freqs.count(1)+freqs.count(2)+freqs.count(3)], rotation=90) #ommited
#plt.text(freqs.count(1)+freqs.count(2)+freqs.count(3)+freqs.count(4)+freqs.count(5)/9,0.4,"Authors with 5 books: 1.95", size=30, color=second_plot_colors[freqs.count(1)+freqs.count(2)+freqs.count(3)+freqs.count(4)], rotation=90) #ommited
plt.savefig("best_authors.png")
#plt.text(10, 25000, 'text', size=15, color='purple')
#plt.arrow(1000, 10, 10, 25000, head_width=0.05, head_length=0.03, linewidth=4, color='r', length_includes_head=True)
plt.scatter(x, y, s=z, alpha=0.5)
#to show distribution
Out[179]:
<matplotlib.collections.PathCollection at 0x7fa47447d250>
In [138]:
#x = [1]*freqs.count(1)
fig=plt.figure(figsize=(20, 10))
plt.xlabel("Number of books", size=15)
#plt.ylabel("Maximum number of authors {}".format(freqs.count(1)), size=15)
plt.ylabel("Maximum number of authors", size=15)
plt.title("Distribution of Authors based on their number of books")
#frame1 = fig.gca()
#frame1.axes.xaxis.set_ticklabels([])
#frame1.axes.yaxis.set_ticklabels([])
#plt.yticks(color='w')
#plt.axis('on')
#plt.tick_params(left='off', top='off', right='off', labelleft='off', labeltop='off', labelright='off')
#plt.tick_params(axis='both', left='on', top='off', right='off', bottom='off', labelleft='on', labeltop='off', labelright='off', labelbottom='off')
#ax.spines['left'].set_visible(False)
for i in np.unique(freqs):
    x = [i]*(freqs.count(i))
    y = list(np.random.randint(low=0, high=freqs.count(1), size=(freqs.count(i))))
    if i >1 and i < 15:
        plt.scatter(x,y, s=1)
    else:
        plt.scatter(x,y, s=5)
plt.savefig("Authors_number_of_books.png")
In [139]:
# libraries
import matplotlib.pyplot as plt
import numpy as np

colors = ["Crimson", "LightSalmon", "#FFFF00", "#663399","Orange", "#90EE90", "#808000", 
                "#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]

# create dataset
height = [i/len(books)*100 for i in list(most_frequent_gens.values())]
bars = list(most_frequent_gens.keys())
y_pos = np.arange(len(bars))
 
# Create horizontal bars
plt.barh(y_pos, height,color=colors)
plt.xlabel("Percentage")
plt.ylabel("Genre")
plt.title("Percentage of Each Genre in the Total dataset")
 
# Create names on the x-axis
plt.yticks(y_pos, bars)
 
# Show graphic
#plt.show()
plt.savefig("percentage_genre.png")
Distribution of ratings
In [141]:
import matplotlib.pyplot as plt
import seaborn as sns
z = all_ratings
"""
#f = books['book_rating']
sns.kdeplot(Genres_dict["Fiction"], color="Orange", shade=False, label="Fiction")
sns.distplot(Genres_dict["Fantasy"], color="Navy", kde=True, hist=False, label="Fantasy")
sns.distplot(Genres_dict["Romance"], color="Crimson", kde=True, hist=False, label="Romance")
sns.distplot(Genres_dict["Young Adult"], color="green", kde=True, hist=False, label="Young Adult")
sns.distplot(Genres_dict["Horror"], color="purple", kde=True, hist=False, label="Horror")
sns.distplot(Genres_dict["Autobiography"], color="pink", kde=True, hist=False, label="Autobiography")
sns.distplot(Genres_dict["African American"], color="#28DA1D", kde=True, hist=False, label="African American")
sns.distplot(Genres_dict["Adventure"], color="gray", kde=True, hist=False, label="Adventure")
sns.distplot(Genres_dict["Science"], color="blue", kde=True, hist=False, label="Science")
sns.distplot(Genres_dict["Magic"], color="#EF15B8", kde=True, hist=False, label="Magic")
sns.distplot(Genres_dict["Animals"], color="#EFC515", kde=True, hist=False, label="Animals")
"""
sns.kdeplot(all_ratings, color="#15EF30", shade=True, label="All genres")
#sns.kdeplot(z, color="green", shade=True, label="All ratings")
#sns.kdeplot(f, color="Khaki", shade=False)
plt.savefig("/Users/fatemeh/Desktop/circles/rating_distribution_12.png")
plt.legend()
plt.show()
Distribution of number of books
In [142]:
import matplotlib.pyplot as plt
labels =  ["One", "Two", "Three", "Four", "5-20", ">20"]
sizes = [float(freqs.count(1)), freqs.count(2), freqs.count(3), freqs.count(4), 3539-freqs.count(4), 274+43]
#colors
colors = ['#593483','#E7DA57', '#5A3614', '#6CE2AC','salmon','#0015BC']
 
fig1, ax1 = plt.subplots()
ax1.pie(sizes, colors = colors, labels=labels, autopct='%1.1f%%', startangle=45)
#draw circle
centre_circle = plt.Circle((0,0),0.75,fc='white')
#fig = plt.figure(figsize=(10, 10))
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')  
plt.title("Frequency of Authors' number of books")
plt.tight_layout()
#plt.show()
plt.savefig("circle.png")
WC of all genres
In [169]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

books = pd.read_csv('book_data.csv',error_bad_lines = False)
g= []
for i in range(books.shape[0]):
    if isinstance(books['genres'].iloc[i], str) and len(books['genres'].iloc[i])>0:
        g.append(books['genres'].iloc[i])
#figuring out all the existing genres and saving the genres in Genres_list list
Genres_dict = {}
for i in range(len(books)):
    row = books.iloc[i]
    Gen = str(row.genres)
    genres = Gen.split('|')
    for genre in genres:
        if genre not in Genres_dict:
            Genres_dict[str(genre)] = 1
        else:
            Genres_dict[str(genre)] +=1
#len(Genres_list)

#word_could_dict=Counter(g)
custom_mask = np.array(Image.open("book.png"))
wordcloud = WordCloud(background_color="white", collocations=False, mask=custom_mask, contour_width=1, contour_color='gray').generate_from_frequencies(Genres_dict)
#wc = WordCloud(background_color="white", mask=custom_mask)
#wc = WordCloud(background_color="white", collocations=False, mask=custom_mask, contour_width=1, contour_color='gray')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig("wc143.png")
plt.show()